# Imports: pandas/numpy for wrangling, plotly for interactive visualization.
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import calendar
import math
import time
# Render plotly figures inline in the notebook.
pio.renderers.default = "notebook"
import warnings
# NOTE(review): this silences ALL warnings, including pandas'
# SettingWithCopyWarning — consider filtering specific categories instead.
warnings.simplefilter("ignore")
# Load the Ford GoBike trip data into a DataFrame.
trip_data_path = "data/201902-fordgobike-tripdata.csv"
df = pd.read_csv(trip_data_path)
# Preview the first rows to confirm the file parsed as expected.
df.head()
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 2019-02-28 18:53:21.7890 | 2019-03-01 06:42:03.0560 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 2535 | Customer | NaN | NaN | No |
| 2 | 61854 | 2019-02-28 12:13:13.2180 | 2019-03-01 05:24:08.1460 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 2019-02-28 17:54:26.0100 | 2019-03-01 04:02:36.8420 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 2019-02-28 23:54:18.5490 | 2019-03-01 00:20:44.0740 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
# High-level overview: row count, per-column dtypes, and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null object 2 end_time 183412 non-null object 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 start_station_latitude 183412 non-null float64 6 start_station_longitude 183412 non-null float64 7 end_station_id 183215 non-null float64 8 end_station_name 183215 non-null object 9 end_station_latitude 183412 non-null float64 10 end_station_longitude 183412 non-null float64 11 bike_id 183412 non-null int64 12 user_type 183412 non-null object 13 member_birth_year 175147 non-null float64 14 member_gender 175147 non-null object 15 bike_share_for_all_trip 183412 non-null object dtypes: float64(7), int64(2), object(7) memory usage: 22.4+ MB
# List all column names in the dataset.
df.columns
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'start_station_latitude',
'start_station_longitude', 'end_station_id', 'end_station_name',
'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
'member_birth_year', 'member_gender', 'bike_share_for_all_trip'],
dtype='object')
# Drop the station-location/ID columns that are not needed for this analysis.
columns_to_be_dropped = [
    'start_station_id', 'start_station_latitude', 'start_station_longitude',
    'end_station_id', 'end_station_latitude', 'end_station_longitude',
    'bike_share_for_all_trip',
]
# Use the explicit `columns=` keyword rather than the positional `axis=1`.
df.drop(columns=columns_to_be_dropped, inplace=True)
# Confirm the columns are gone.
df.head(1)
| duration_sec | start_time | end_time | start_station_name | end_station_name | bike_id | user_type | member_birth_year | member_gender | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | Montgomery St BART Station (Market St at 2nd St) | Commercial St at Montgomery St | 4902 | Customer | 1984.0 | Male |
# Count the number of fully duplicated rows in the DataFrame.
df.duplicated().sum()
0
# Count null values per column.
df.isnull().sum()
duration_sec 0 start_time 0 end_time 0 start_station_name 197 end_station_name 197 bike_id 0 user_type 0 member_birth_year 8265 member_gender 8265 dtype: int64
# Drop every row that contains at least one null value.
df.dropna(inplace=True)
# Verify that no null values remain anywhere in the DataFrame.
df.isnull().sum().any()
False
# Shape after cleaning: (rows, columns).
df.shape
(174952, 9)
# Check the data type of each column.
df.dtypes
duration_sec int64 start_time object end_time object start_station_name object end_station_name object bike_id int64 user_type object member_birth_year float64 member_gender object dtype: object
Some data types need to be converted.
Since this data covers only February, I could hard-code the month, but I will write the code so that it also works with the same kind of dataset for another month.
# 1. Convert the start_time strings to datetime so date parts can be derived.
df['start_time'] = pd.to_datetime(df['start_time'])
# Use the vectorized .dt accessor instead of a row-wise apply/lambda.
df['start_month'] = df['start_time'].dt.month
# Sanity check: all trips should fall within a single month.
print(df['start_month'].value_counts())
2 174952 Name: start_month, dtype: int64
While inspecting the data I found it would have better context if we added columns for working hours, weekdays, and age.
# 2. Derive the start hour and flag trips that begin during working hours (9-17).
df['start_hour'] = df['start_time'].dt.hour  # vectorized, replaces apply/lambda
df['is_working_hour'] = 0
# Assign via .loc: the original chained assignment df['col'][mask] = 1 can
# silently write to a copy (SettingWithCopy), and warnings are suppressed above.
df.loc[(df['start_hour'] >= 9) & (df['start_hour'] < 17), 'is_working_hour'] = 1
# Check the distributions: 1 = working hour, 0 = non-working hour.
print(df['start_hour'].value_counts())
print(df['is_working_hour'].value_counts())
17 20904 8 20227 18 16118 9 15204 16 13473 7 10231 19 9424 15 8646 12 8220 13 8098 10 7970 14 7677 11 7461 20 6211 21 4400 6 3293 22 2793 23 1572 0 893 5 866 1 525 2 355 4 227 3 164 Name: start_hour, dtype: int64 0 98203 1 76749 Name: is_working_hour, dtype: int64
# 3. Flag weekdays: dayofweek is 0 (Monday) .. 6 (Sunday), so Mon-Fri -> 1
# and Sat/Sun -> 0. The vectorized comparison replaces apply + dict mapping.
df['is_week_day'] = (df['start_time'].dt.dayofweek < 5).astype(int)
# 4. Add an age column. Keep the dataset's year in one named constant so the
# same code adapts easily to data from another year.
DATA_YEAR = 2019
df['age'] = DATA_YEAR - df['member_birth_year']
# Summary statistics to spot implausible ages.
df['age'].describe()
count 174952.000000 mean 34.196865 std 10.118731 min 18.000000 25% 27.000000 50% 32.000000 75% 39.000000 max 141.000000 Name: age, dtype: float64
We can see a maximum age of 141 years, which seems unrealistic.
# Remove implausible ages: anyone born before 1925 would be 94+ years old.
unrealistic_rows = df[df['member_birth_year'] < 1925].index
df.drop(unrealistic_rows, inplace=True)
# Confirm the resulting shape and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174877 entries, 0 to 183411 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174877 non-null int64 1 start_time 174877 non-null datetime64[ns] 2 end_time 174877 non-null object 3 start_station_name 174877 non-null object 4 end_station_name 174877 non-null object 5 bike_id 174877 non-null int64 6 user_type 174877 non-null object 7 member_birth_year 174877 non-null float64 8 member_gender 174877 non-null object 9 start_month 174877 non-null int64 10 start_hour 174877 non-null int64 11 is_working_hour 174877 non-null int64 12 is_week_day 174877 non-null int64 13 age 174877 non-null float64 dtypes: datetime64[ns](1), float64(2), int64(6), object(5) memory usage: 20.0+ MB
The dataset is now clean and ready to be analysed.
# Split the data into weekday and weekend trips (is_week_day is 1 or 0).
weekday_mask = df['is_week_day'] == 1
df_weekday = df[weekday_mask]
df_weekend = df[~weekday_mask]
# Histogram: weekday trip counts by start hour.
fig = px.histogram(df_weekday, x="start_hour", title='Number of trips in weekdays')
fig.update_layout(
    bargap=0.1,
    yaxis_title="Number of trips",
    xaxis_title="Hour",
)
fig.update_xaxes(dtick=1)  # label every hour on the x-axis
fig.show()
# Histogram: weekend trip counts by start hour.
fig = px.histogram(df_weekend, x="start_hour", title='Number of trips on weekends')
fig.update_layout(
    bargap=0.1,
    yaxis_title="Number of trips",
    xaxis_title="Hour",
)
fig.update_xaxes(dtick=1)  # label every hour on the x-axis
fig.show()
# Identify the ten most frequent start stations.
top_10_start_station = df['start_station_name'].value_counts().nlargest(10).index
# Keep only the trips departing from one of those stations
# (filtering approach from https://stackoverflow.com/a/12098586).
df_top10_stations = df[df['start_station_name'].isin(top_10_start_station)]
# Horizontal histogram of the top 10 busiest stations, busiest at the top.
fig = px.histogram(
    df_top10_stations, y="start_station_name", title='Top 10 busiest stations'
)
fig.update_yaxes(categoryorder='total ascending')
fig.update_layout(bargap=0.2, yaxis_title="Station name", xaxis_title="Usage frequency")
fig.update_yaxes(dtick=1)
fig.show()
# Isolate trips starting at the "Market St at 10th St" station.
df_Market_St_at_10th_St = df[df['start_station_name'] == "Market St at 10th St"]
# Plot that station's hourly usage frequency.
fig = px.histogram(
    df_Market_St_at_10th_St,
    x="start_hour",
    title='Market St at 10th St station usage frequency',
)
fig.update_layout(bargap=0.1, yaxis_title="Number of trips", xaxis_title="Hour")
fig.update_xaxes(dtick=1)
fig.show()
# Average trip duration per user type.
# Aggregate only duration_sec: calling .mean() on the whole frame raises on
# pandas >= 2.0 because object (string) columns are still present.
df_avg_trip = df.groupby(by="user_type")[["duration_sec"]].mean()
fig = px.bar(df_avg_trip, x=df_avg_trip.index, y=df_avg_trip.duration_sec)
fig.update_layout(
    title='Average trip duration for each user type',
    xaxis_title = "User type",
    yaxis_title = "Average duration of trip",
)
fig.show()
# Histogram: number of trips made by each user type.
fig = px.histogram(df, x="user_type", title='Number of trips for each user type')
fig.update_layout(
    bargap=0.1,
    yaxis_title="Number of trips",
    xaxis_title="User type",
)
fig.update_xaxes(dtick=1)
fig.show()
# Average trip duration by start hour.
# Restrict the aggregation to duration_sec: a frame-wide .mean() raises on
# pandas >= 2.0 when object columns are present.
df_avg_trip_time = df.groupby(by="start_hour")[["duration_sec"]].mean()
fig = px.bar(df_avg_trip_time, x=df_avg_trip_time.index, y=df_avg_trip_time.duration_sec)
fig.update_layout(
    title='Average trip duration time along the day',
    xaxis_title = "Hour",
    # Fixed label: the y-axis shows average duration, not trip counts.
    yaxis_title = "Average trip duration (seconds)",
    showlegend=False
)
fig.update_xaxes(dtick=1)
fig.show()
# Restrict to trips of at most 4000 seconds to drop extreme outliers.
short_trip_mask = df['duration_sec'] <= 4000
df_dur_10 = df[short_trip_mask]
# Shape of the filtered frame.
df_dur_10.shape
(173679, 14)
# Scatter of trip duration vs. age, colored by gender.
fig = px.scatter(
    df_dur_10, y="duration_sec", x="age", color="member_gender",
    opacity=0.5, labels={"member_gender": "Gender"},
)
fig.update_layout(
    title='Trip duration with respect to gender and age',
    xaxis_title = "Age",
    # Fixed axis-label typo: "is seconds" -> "in seconds".
    yaxis_title = "Trip duration in seconds",
)
fig.update_traces(marker_size=10)
fig.show()
# Notched box plots of age by user type, split by gender, to look for any
# correlation between age, gender, and user type.
fig = px.box(
    df,
    x="user_type",
    y="age",
    color="member_gender",
    labels={"member_gender": "Gender"},
    notched=True,
    title="Is there any correlation between age, gender and user type",
)
fig.update_layout(xaxis_title="User Type", yaxis_title="Age")
fig.show()